# Copyright 2025 Garena Online Private Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os

import fire
from openai import OpenAI

"""
Description:
    Evaluating the model question answering ability using llm-based analysis.

Example usage:
python answering_ability.py --file_name deepseek_v3_base.json
"""


def main(file_name: str = "deepseek_v3_base.json"):
    """Evaluate question-answering ability of model outputs via an LLM judge.

    Reads a JSON list of records (each with "input" and "output" keys),
    asks an OpenAI model to classify whether each output attempts to answer
    the question, tallies the results, and writes the annotated records to
    ``<file_name stem>_ab.json``.

    Args:
        file_name: Path to the JSON file of model outputs to evaluate.
    """
    # Load records with a context manager so the handle is closed promptly.
    with open(file_name, encoding="utf-8") as f:
        output = json.load(f)

    instruction = """I will send you a question and a long output generated by an LLM. Your task is to determine whether the output attempts to answer the question or not. The output may sometimes include irrelevant content, hallucinations, or random, off-topic responses.  

Please classify the output into one of the following categories:  

### **Output Format**:  

Your response must start with a **single integer** (0 or 1), followed by a **brief explanation**.  

- **Return 0** → The output is not trying to answer the question (e.g., irrelevant content, random talking, hallucinations).  
  *Example output:* `0: The response is off-topic and does not address the question.`  

- **Return 1** → The output attempts to answer the question, regardless of how complete or accurate the answer is.  
  *Example output:* `1: The response engages with the question, even if the answer is incomplete or incorrect.`  

**Question:** {question}  
**Model Output:** {response}  
"""

    # Use the key from the environment when present; the placeholder is only
    # a fallback so the script fails loudly at the API rather than silently
    # overwriting a real key the user already exported.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))

    # LLM judge model and decoding parameters (deterministic, short verdicts).
    llm_model = "gpt-4o-mini-2024-07-18"
    llm_temp = 0.0
    llm_max_tokens = 100

    # Tallies: 0 → mere completion, 1 → genuine answering attempt, else other.
    question_completion = 0
    question_answering = 0
    question_other = 0

    print(f"Evaluating the question answering ability in {file_name}")
    for idx, o in enumerate(output):
        o["idx"] = idx

        # LLM-based detection for question answering ability.
        prompt = instruction.format(question=o["input"], response=o["output"])
        chat_completion = client.chat.completions.create(
            model=llm_model,
            temperature=llm_temp,
            max_tokens=llm_max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )
        # message.content is Optional in the SDK typing; guard against None
        # so the startswith checks below cannot raise.
        response_text = chat_completion.choices[0].message.content or ""

        # Check the response: leading "0" → question completion,
        # leading "1" → question answering, anything else → other.
        if response_text.startswith("0"):
            question_completion += 1
        elif response_text.startswith("1"):
            question_answering += 1
        else:
            question_other += 1

        # Attach the raw judge verdict to the record for later inspection.
        o["llm_detection"] = response_text

    # Summarize the tallies.
    res = {
        "question_completion": question_completion,
        "question_answering": question_answering,
        "other": question_other,
    }
    print(f"answering ability: {res}")

    # Save the annotated records next to the input, with an "_ab" suffix.
    out_file_name = file_name.replace(".json", "_ab.json")
    with open(out_file_name, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4)


# Guard the CLI entry point so importing this module does not immediately
# launch the fire command-line interface (and its API calls).
if __name__ == "__main__":
    fire.Fire(main)
